library(fivethirtyeight) # new to you!
library(moderndive)
library(skimr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(GGally) # new to you!
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(corrr)
The data we are interest is a subset of hate_crimes data set. The variables we are interested in have the following meanings:
avg_hatecrimes_per_100k_fbi: Average annual hate crimes per 100,000 population, FBI, 2010-2015
gini_index: Gini Index, 2015
share_pop_hs: Share of adults 25 and older with a high-school degree, 2009
share_vote_trump: Share of 2016 U.S. presidential voters who voted for Donald Trump
glimpse(hate_crimes)
## Observations: 51
## Variables: 12
## $ state <chr> "Alabama", "Alaska", "Arizona", "A...
## $ median_house_inc <int> 42278, 67629, 49254, 44922, 60487,...
## $ share_unemp_seas <dbl> 0.060, 0.064, 0.063, 0.052, 0.059,...
## $ share_pop_metro <dbl> 0.64, 0.63, 0.90, 0.69, 0.97, 0.80...
## $ share_pop_hs <dbl> 0.821, 0.914, 0.842, 0.824, 0.806,...
## $ share_non_citizen <dbl> 0.02, 0.04, 0.10, 0.04, 0.13, 0.06...
## $ share_white_poverty <dbl> 0.12, 0.06, 0.09, 0.12, 0.09, 0.07...
## $ gini_index <dbl> 0.472, 0.422, 0.455, 0.458, 0.471,...
## $ share_non_white <dbl> 0.35, 0.42, 0.49, 0.26, 0.61, 0.31...
## $ share_vote_trump <dbl> 0.63, 0.53, 0.50, 0.60, 0.33, 0.44...
## $ hate_crimes_per_100k_splc <dbl> 0.12583893, 0.14374012, 0.22531995...
## $ avg_hatecrimes_per_100k_fbi <dbl> 1.8064105, 1.6567001, 3.4139280, 0...
The data is made up of 12 variables, 11 numerical and 1 categorical, the state. There is one row per state.
hate_crimes %>%
count(state, sort = TRUE)
## # A tibble: 51 x 2
## state n
## <chr> <int>
## 1 Alabama 1
## 2 Alaska 1
## 3 Arizona 1
## 4 Arkansas 1
## 5 California 1
## 6 Colorado 1
## 7 Connecticut 1
## 8 Delaware 1
## 9 District of Columbia 1
## 10 Florida 1
## # ... with 41 more rows
From the table we can see that the data includes all 50 states and the District of Colombia.
crime <- select(hate_crimes,
avg_hatecrimes_per_100k_fbi,
share_pop_hs,
gini_index,
share_vote_trump)
skim(crime)
## Skim summary statistics
## n obs: 51
## n variables: 4
##
## ── Variable type:numeric ───────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## avg_hatecrimes_per_100k_fbi 1 50 51 2.37 1.71 0.27 1.29 1.99
## gini_index 0 51 51 0.45 0.021 0.42 0.44 0.45
## share_pop_hs 0 51 51 0.87 0.034 0.8 0.84 0.87
## share_vote_trump 0 51 51 0.49 0.12 0.04 0.41 0.49
## p75 p100 hist
## 3.18 10.95 ▇▇▅▁▁▁▁▁
## 0.47 0.53 ▅▅▇▇▁▁▁▁
## 0.9 0.92 ▂▅▅▃▃▅▇▆
## 0.57 0.7 ▁▁▁▃▇▇▆▃
ggplot(crime, aes(x = avg_hatecrimes_per_100k_fbi)) +
geom_density() +
labs(x = "", title = "Hate Crimes")
## Warning: Removed 1 rows containing non-finite values (stat_density).
# Histogram of share_pop_hs (IV):
ggplot(crime, aes(x = share_pop_hs)) +
geom_density() +
labs(x = "", title = "HS")
# Histogram of gini (IV):
ggplot(crime, aes(x = gini_index)) +
geom_density() +
labs(x = "", title = "Gini")
# Histogram of trump (IV):
ggplot(crime, aes(x = share_vote_trump)) +
geom_density() +
labs(x = "", title = "Trump")
hate_demo <- crime %>%
mutate(
cat_trump = case_when(
share_vote_trump < .5 ~ "less than half",
TRUE ~ "more than half"
)) %>%
mutate(cat_trump = as.factor(cat_trump)) %>%
select(-share_vote_trump)
Now lets look at the way each variable correlates to the variable of interest.
correlate(select(crime, avg_hatecrimes_per_100k_fbi, gini_index))
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 2 x 3
## rowname avg_hatecrimes_per_100k_fbi gini_index
## <chr> <dbl> <dbl>
## 1 avg_hatecrimes_per_100k_fbi NA 0.421
## 2 gini_index 0.421 NA
correlate(select(crime, avg_hatecrimes_per_100k_fbi, share_pop_hs))
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 2 x 3
## rowname avg_hatecrimes_per_100k_fbi share_pop_hs
## <chr> <dbl> <dbl>
## 1 avg_hatecrimes_per_100k_fbi NA 0.164
## 2 share_pop_hs 0.164 NA
correlate(select(crime, avg_hatecrimes_per_100k_fbi, share_vote_trump))
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 2 x 3
## rowname avg_hatecrimes_per_100k_fbi share_vote_trump
## <chr> <dbl> <dbl>
## 1 avg_hatecrimes_per_100k_fbi NA -0.505
## 2 share_vote_trump -0.505 NA
There is a 0.4212719 correlation between hate crimes and gini index value. There is a 0.1641476 correlation between hate crimes and high school grad rate. There is a 0.5045316 correlation between hate crimes and trump voter rate.
ggplot(crime, aes(x = share_pop_hs, y = avg_hatecrimes_per_100k_fbi)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(crime, aes(x = gini_index, y = avg_hatecrimes_per_100k_fbi)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(crime, aes(x = share_vote_trump, y = avg_hatecrimes_per_100k_fbi)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
ggpairs(hate_demo, aes(color = cat_trump))
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Now we will
hate_crime_model <- lm( avg_hatecrimes_per_100k_fbi ~ gini_index + share_pop_hs, data=crime)
get_regression_table(hate_crime_model)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept -54.0 9.76 -5.53 0 -73.7 -34.4
## 2 gini_index 64.3 11.1 5.79 0 42.0 86.7
## 3 share_pop_hs 31.3 6.81 4.59 0 17.6 45.0
summary(hate_crime_model)
##
## Call:
## lm(formula = avg_hatecrimes_per_100k_fbi ~ gini_index + share_pop_hs,
## data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7064 -0.9064 0.0390 0.7721 3.4995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -54.017 9.763 -5.533 1.36e-06 ***
## gini_index 64.324 11.106 5.792 5.55e-07 ***
## share_pop_hs 31.287 6.813 4.592 3.29e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.319 on 47 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4322, Adjusted R-squared: 0.408
## F-statistic: 17.89 on 2 and 47 DF, p-value: 1.673e-06
The regression table seems to indicate that when you combine the two independent variables into the model you end up with a very steep line. There is an intercept at -54 hate crimes, which is not possible, with a rise of 64 hate crimes for a movement of one in the positive gini index axis and a rise in 34 hate crimes for moving to 100 percent of the population HS educated.
This is drastically different than what we saw in the simple models, where there were not very steep lines cutting through the data. The multi- regression seems to have caused a steep cut up through the data.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
dim_scatter <- plot_ly(crime,
x = ~share_pop_hs,
y = ~gini_index,
z = ~avg_hatecrimes_per_100k_fbi) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'HS'),
yaxis = list(title = 'Gini'),
zaxis = list(title = 'Hate Crimes')))
dim_scatter
## Warning: Ignoring 1 observations